@@ -20,7 +20,7 @@ module Agents |
||
| 20 | 20 |
|
| 21 | 21 |
To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes. |
| 22 | 22 |
|
| 23 |
- When parsing HTML or XML, these sub-hashes specify how to extract with a `css` CSS selector and either `'text': true` or `attr` pointing to an attribute name to grab. An example: |
|
| 23 |
+ When parsing HTML or XML, these sub-hashes specify how to extract with either a `css` CSS selector or an `xpath` XPath expression, and either `'text': true` or `attr` pointing to an attribute name to grab. An example: |
|
| 24 | 24 |
|
| 25 | 25 |
'extract': {
|
| 26 | 26 |
'url': { 'css': "#comic img", 'attr': "src" },
|
@@ -109,21 +109,36 @@ module Agents |
||
| 109 | 109 |
else |
| 110 | 110 |
output = {}
|
| 111 | 111 |
options['extract'].each do |name, extraction_details| |
| 112 |
- result = if extraction_type == "json" |
|
| 113 |
- output[name] = Utils.values_at(doc, extraction_details['path']) |
|
| 114 |
- else |
|
| 115 |
- output[name] = doc.css(extraction_details['css']).map { |node|
|
|
| 116 |
- if extraction_details['attr'] |
|
| 117 |
- node.attr(extraction_details['attr']) |
|
| 118 |
- elsif extraction_details['text'] |
|
| 119 |
- node.text() |
|
| 120 |
- else |
|
| 121 |
- error "'attr' or 'text' is required on HTML or XML extraction patterns" |
|
| 122 |
- return |
|
| 123 |
- end |
|
| 124 |
- } |
|
| 125 |
- end |
|
| 126 |
- log "Extracting #{extraction_type} at #{extraction_details['path'] || extraction_details['css']}: #{result}"
|
|
| 112 |
+ if extraction_type == "json" |
|
| 113 |
+ result = Utils.values_at(doc, extraction_details['path']) |
|
| 114 |
+ log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
|
|
| 115 |
+ else |
|
| 116 |
+ case |
|
| 117 |
+ when css = extraction_details['css'] |
|
| 118 |
+ nodes = doc.css(css) |
|
| 119 |
+ when xpath = extraction_details['xpath'] |
|
| 120 |
+ nodes = doc.xpath(xpath) |
|
| 121 |
+ else |
|
| 122 |
+ error "'css' or 'xpath' is required for HTML or XML extraction" |
|
| 123 |
+ return |
|
| 124 |
+ end |
|
| 125 |
+ unless Nokogiri::XML::NodeSet === nodes |
|
| 126 |
+ error "The result of HTML/XML extraction was not a NodeSet" |
|
| 127 |
+ return |
|
| 128 |
+ end |
|
| 129 |
+ result = nodes.map { |node|
|
|
| 130 |
+ if extraction_details['attr'] |
|
| 131 |
+ node.attr(extraction_details['attr']) |
|
| 132 |
+ elsif extraction_details['text'] |
|
| 133 |
+ node.text() |
|
| 134 |
+ else |
|
| 135 |
+ error "'attr' or 'text' is required on HTML or XML extraction patterns" |
|
| 136 |
+ return |
|
| 137 |
+ end |
|
| 138 |
+ } |
|
| 139 |
+ log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
|
|
| 140 |
+ end |
|
| 141 |
+ output[name] = result |
|
| 127 | 142 |
end |
| 128 | 143 |
|
| 129 | 144 |
num_unique_lengths = options['extract'].keys.map { |name| output[name].length }.uniq
|
@@ -228,4 +243,4 @@ module Agents |
||
| 228 | 243 |
end |
| 229 | 244 |
end |
| 230 | 245 |
end |
| 231 |
-end |
|
| 246 |
+end |
@@ -114,6 +114,19 @@ describe Agents::WebsiteAgent do |
||
| 114 | 114 |
event.payload['hovertext'].should =~ /^Biologists play reverse/ |
| 115 | 115 |
end |
| 116 | 116 |
|
| 117 |
+ it "parses XPath" do |
|
| 118 |
+ @site['extract'].each { |key, value|
|
|
| 119 |
+ value.delete('css')
|
|
| 120 |
+ value['xpath'] = "//*[@id='comic']//img" |
|
| 121 |
+ } |
|
| 122 |
+ @checker.options = @site |
|
| 123 |
+ @checker.check |
|
| 124 |
+ event = Event.last |
|
| 125 |
+ event.payload['url'].should == "http://imgs.xkcd.com/comics/evolving.png" |
|
| 126 |
+ event.payload['title'].should == "Evolving" |
|
| 127 |
+ event.payload['hovertext'].should =~ /^Biologists play reverse/ |
|
| 128 |
+ end |
|
| 129 |
+ |
|
| 117 | 130 |
it "should turn relative urls to absolute" do |
| 118 | 131 |
rel_site = {
|
| 119 | 132 |
'name' => "XKCD", |
@@ -258,4 +271,4 @@ describe Agents::WebsiteAgent do |
||
| 258 | 271 |
end |
| 259 | 272 |
end |
| 260 | 273 |
end |
| 261 |
-end |
|
| 274 |
+end |